# -*- coding: utf-8 -*-

'''
爬取京东某一频道的商品的商品名、商品价格、商品出售方、商品评论数等信息
存储到一个文件中
附加要求：
    把对应商品的评论情况爬下来，要求爬2页以上评论（如果足2页）
Created on 2018-01-05
@author: Zuolong

'''
from lxml import etree
import urllib.request
import urllib.parse
import re
import random
import io
import sys
import os

# Ensure the output directory for per-product review files exists;
# announce only when it is freshly created (same behavior as before).
if not os.path.exists('./comment/'):
    os.mkdir('comment')
    print('创建文件夹comment成功')


# Search keyword, URL-encoded for the JD search query string, and the
# page-encoding parameter sent along with it.
key = '零食'
# quote() canonically lives in urllib.parse; urllib.request only re-exports it.
key = urllib.parse.quote(key)
enc = 'utf-8'

def write_error_log(error_str):
    """Append *error_str* to ./error_log.txt under a timestamp banner.

    Best-effort: any failure while writing is printed instead of raised,
    so logging can never interrupt the crawl loop that calls this.
    """
    # Bug fix: `time` was used here but never imported anywhere in the
    # file, so every call died with a NameError (swallowed by the except
    # below) and nothing was ever logged. Import locally to stay
    # self-contained.
    import time
    try:
        with open("./error_log.txt", "a") as fh:
            fh.write(time.strftime("-" * 30 + "%Y-%m-%d %H:%M:%S", time.localtime(time.time())) + "-" * 30 + "\n")
            fh.write(str(error_str) + "\n")
    except Exception as e:
        print(e)


# Pool of desktop-browser User-Agent strings (Chrome, Edge, Firefox).
# One is chosen at random per request by ua() below so successive
# fetches do not all present the same client signature.
uapools = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'
    ]

def ua(uapools):
    """Pick a random User-Agent from *uapools*, echo it, and install an
    opener carrying it as the process-wide default for urlopen()."""
    chosen = random.choice(uapools)
    print(chosen)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', chosen)]
    # Install globally so every subsequent plain urlopen() call sends
    # this header without threading the opener through the call sites.
    urllib.request.install_opener(opener)

if __name__ == '__main__':
    # Accumulates one "{...}" record line per product; flushed to
    # ./result.txt once all pages are processed.
    result = ''
    for y in range(1, 31):
        try:
            # Requested page number is y*2-1 (1, 3, 5, ...) — presumably
            # JD's search paginates in half-pages so odd numbers are the
            # full result pages; TODO confirm.
            url = 'https://search.jd.com/Search?keyword='+key+'&enc='+enc+'&&page='+str(y*2-1)
            ua(uapools)  # rotate User-Agent and reinstall the global opener
            data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
            print('正在爬取第'+str(y)+'页的数据。。。')
            dom_tree = etree.HTML(data)
            
            # One <li class="gl-item"> per product card on the results page.
            shopList = dom_tree.xpath('//li[@class="gl-item"]')

            for i in range(0, len(shopList)):
                # Serialize the card back to an HTML string so the fields
                # below can be pulled out with regexes.
                item = etree.tostring(shopList[i], encoding="utf-8").decode('utf-8', 'ignore')
                
                # Product id from the data-pid attribute; cards without one
                # are skipped entirely.
                pat_pid = '<li class="gl-item" data-sku=".*?" data-spu=".*?" data-pid="(.*?)">'
                pid = re.compile(pat_pid, re.S).findall(item)
                if len(pid) == 0:
                    continue
                result += '{"商品ID":'+pid[0]+', '
                
                # Product name from the <em> in the name div, stripping the
                # promotional markup JD embeds around it.
                pat_name = '<div class="p-name p-name-type-2">.*?<em>(.*?)</em>.*?</div>'
                pname = re.compile(pat_name, re.S).findall(item)
                p_name = pname[0].replace('<span class="p-tag" style="background-color:#c81623">京东超市</span>', '')
                p_name = p_name.replace('<font class="skcolor_ljg">', '')
                p_name = p_name.replace('</font>', '')
                p_name = p_name.replace('<img class="p-tag3" src="//img14.360buyimg.com/uba/jfs/t6919/268/501386350/1257/92e5fb39/5976fcf9Nd915775f.png"/>', '')
                result += '"商品名":'+p_name+', '
                
                # Product price: text of the <i> inside the price div.
                pat_price = '<div class="p-price">.*?<i>(.*?)</i>.*?</div>'
                p_price = re.compile(pat_price, re.S).findall(item)
                result += '"商品价格":'+p_price[0]+', '

                # Seller name; falls back through two alternative markup
                # variants when the primary pattern matches nothing.
                pat_shop = '<div class="p-shop".*?>.*?<span class="J_im_icon"><a .*?>(.*?)</a>'
                p_shop = re.compile(pat_shop, re.S).findall(item)
                if(len(p_shop) == 0):
                    pat_shop = '<div class="p-icons" .*?>.*?<i .*?>(.*?)</i>.*?</div>'
                    p_shop = re.compile(pat_shop, re.S).findall(item)
                if(len(p_shop) == 0):
                    pat_shop = '<span class="p-promo-flag">(.*?)</span>'
                    p_shop = re.compile(pat_shop, re.S).findall(item)
                result += '"商品出售方":'+p_shop[0]+', '
                
                # Review count; the capture includes a trailing </a>, which
                # is stripped here.
                pat_commit = '<div class="p-commit">.*?<strong><a .*?>(.*?)</strong>'
                p_commit = re.compile(pat_commit, re.S).findall(item)
                p_commit = p_commit[0].replace('</a>', '')
                result += '"商品评论数":'+p_commit+'}\n'

                # Fetch up to 5 pages (10 reviews each) of this product's
                # comments from JD's JSONP comment endpoint.
                commitResult = ''
                for j in range(0, 5):
                    # Comment endpoint URL for page j of this product.
                    commit_link = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv0&productId='+pid[0]+'&score=0&sortType=6&page='+str(j)+'&pageSize=10'
                    
                    # NOTE(review): decoded as GBK — presumably that is what
                    # the endpoint serves; verify. A decode error here aborts
                    # the whole product via the outer except.
                    commitData = urllib.request.urlopen(commit_link).read().decode('gbk')
                    
                    # Extract each review's "content" field from the raw
                    # JSONP text.
                    pat_commit1 = '"content":"(.*?)"'
                    commitList = re.compile(pat_commit1, re.S).findall(commitData)
                    for k in range(0, len(commitList)):
                        commitResult += commitList[k]+'\n'
                        commitResult += '------------------------------------\n'
                        
                # Save this product's reviews to comment/<page>-<index>-<pid>.txt
                fm = open('./comment/'+str(y)+'-'+str(i)+'-'+pid[0]+'.txt', 'w', encoding="utf-8")
                fm.write(commitResult)
                fm.close()
                
        except Exception as err:
            # Any failure for a page or product is logged and the crawl
            # moves on to the next page.
            write_error_log(err)
    
    # Write out the accumulated product records.
    fh = open('./result.txt', 'w', encoding='utf-8')
    fh.write(result)
    fh.close()
        


